###############################################################
####                                                       ####
####   Input: Debates data; various dictionary objects     ####
####   Output: Sentence-level LIWC scores                  ####
####                                                       ####
###############################################################

# Load libraries

library(quanteda) # v3.3.1
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.4
library(data.table) # v1.14.8
library(text2vec) # v0.6.3
library(plyr) # v1.8.9

# Load the inputs. `liwc` and `debates` are used below and are presumably
# provided by the first two files -- TODO confirm. The contents of the
# word-vector file are not referenced in the visible part of this script.

load("data/dictionaries/liwc.Rdata")
load("data/debates.Rdata")
load("working/word_vectors_150.Rdata")

# Dictionary for talking "about women": the LIWC Female and Male entries
# become the dictionary keys `woman` and `man`

woman_words <- liwc$Female
man_words <- liwc$Male
dictionary_to_use_women <- dictionary(
  list(woman = woman_words, man = man_words)
)

# Drop pre-Blair years: keep only rows whose parliamentary term is not
# "1992-1997" (data.table row filter on the `parliamentary_term` column)

debates <- debates[parliamentary_term != "1992-1997"]

## Loop over the distinct `yearmon` values, scoring one batch per iteration
##
## Each iteration cleans the speech text, reshapes it to sentences, drops
## sentences matching procedural Hansard patterns, counts matches against
## `dictionary_to_use_women`, and stores one data.table in `year_out_list`.

year_months <- unique(debates$yearmon)
year_out_list <- vector("list", length(year_months))
i <- 0

for (y in year_months) {

  print(y)
  i <- i + 1

  debates_test <- debates[yearmon == y]

  # Remove all text between square brackets. The character class excludes
  # "]" so that every bracketed span is removed on its own; excluding ")"
  # instead made the greedy match run from the first "[" to the last "]"
  # in a speech and delete the text in between.
  debates_test$body <- gsub("\\s*\\[[^]]+\\]", "", debates_test$body)

  # Replace " hon." with " hon " (presumably so the abbreviation's full
  # stop is not treated as a sentence boundary -- TODO confirm).
  # fixed = TRUE makes the "." literal; as a regex it also matched words
  # such as " honest" or " honour" and cut them apart.
  debates_test$body <- gsub(" hon.", " hon ", debates_test$body, fixed = TRUE)

  # Corpus for speeches

  debates_test_corpus <- corpus(debates_test, text_field = "body")

  ## Convert speech corpus to sentences

  debates_test_corpus <- corpus_reshape(debates_test_corpus, to = "sentences")

  ## Keep the unmodified sentence text for the `sent` output column

  raw_texts <- as.character(debates_test_corpus)

  ## Replace parliamentary "numbers" with placeholders so that they are not picked up as fact-based language
  ## NOTE(review): gsub() is applied to the corpus object itself and the
  ## result is passed to corpus_subset() below, so this relies on gsub()
  ## keeping the corpus attributes -- re-check if quanteda is upgraded.

  debates_test_corpus <- gsub(
    "clause \\d+|schedule \\d+|section \\d+|amendment \\d+|petition \\d+|e-petition \\d+|article \\d+|paragraph \\d+|act \\d+|bill \\d+|in paragraph|after paragraph|clause|schedule|amendment",
    "parlboilerplate",
    debates_test_corpus,
    ignore.case = TRUE
  )

  ## Remove text that is actually hansard language, not part of the speeches

  to_keep <- !grepl("question put and agreed to|I beg to move,|Title of Report|TABLE      Day|Time for conclusion of proceedings|I beg to move|The annual amount of any periodical payment to any person by virtue of her being a widow of a past Member of the House of Commons|Ordered,  That, at the sitting|Additional costs allowance     The annual limit on the additional costs allowance|Resolved,  That|TABLE        Allotted    Day|Table      Proceedings|He is wrong to think that the majority     National Lottery|^Ordered,|^Publication Date|^Supplementary London allowance|^The petition states|^Following is the information|^Following is the full text of the petition|Table       |The following Members took and subscribed the Oath", debates_test_corpus, ignore.case = TRUE)

  # Apply the same filter to the raw and the adjusted sentences so the two
  # stay aligned row by row

  raw_texts <- raw_texts[to_keep]

  debates_test_corpus <- corpus_subset(debates_test_corpus, to_keep)

  debates_test_tokens <- tokens(debates_test_corpus)

  ## Join multi-word entries of the LSD2015 dictionary into single compound
  ## tokens (e.g. "negation word" becomes "negation_word")

  debates_test_tokens <- tokens_compound(debates_test_tokens, data_dictionary_LSD2015)

  sentences_dfm <- dfm(debates_test_tokens)

  # Number of tokens per sentence, counted after compounding

  sentence_length <- ntoken(sentences_dfm)

  ## Calculate dictionary counts (sentence-level measures)

  dfm_liwc <- dfm_lookup(sentences_dfm, dictionary_to_use_women)

  # One column per dictionary key, prefixed with "dic_"

  dict_scores <- as.data.frame(as.matrix(dfm_liwc))
  names(dict_scores) <- paste0("dic_", names(dict_scores))

  # Identifiers carried over from the speech-level document variables

  sentence_vars <- docvars(sentences_dfm)

  year_out <- data.table(epobject_id = sentence_vars$epobject_id,
                         section_id = sentence_vars$section_id,
                         person_id = sentence_vars$person_id,
                         sent = raw_texts,
                         adjusted_sent = as.character(debates_test_corpus),
                         n_words = sentence_length,
                         dict_scores)

  year_out_list[[i]] <- year_out

}

## Save all sentence level scores in one object

# rbindlist() binds the list of per-batch data.tables directly and returns
# a data.table, avoiding the intermediate data.frame built by rbind.fill().
# fill = TRUE pads any column missing from an element with NA, as
# rbind.fill() did.
dictionary_scores_liwc <- rbindlist(year_out_list, use.names = TRUE, fill = TRUE)

save(dictionary_scores_liwc, file = "working/dictionaries_sentence_liwc.Rdata")

